Topics in R

Information Visualization AGA0513

Rafael S. de Souza

4/28/2017

A statistical computing environment

Some functionalities of R

Bayesian Inference Machine Learning Social Sciences
Computational Physics Medical Image Analysis Spatial Data
Cluster Analysis Multivariate Statistics Statistical Genetics
Differential Equations Natural Language Survival Analysis
Econometrics Numerical Mathematics Time Series Analysis
Environmetrics Optimization Visualization
Environmetrics Pharmacokinetics Web Technologies
Extreme Value Analysis Phylogenetics
Empirical Finance Probability Distributions
Functional Data Analysis Psychometrics

Required packages

Some packages to run the examples.

If they are not installed yet, run this first:

# One-time setup: install every package the examples in this deck call into.
# The list covers the packages attached below plus psych (pairs.panels) and
# plotly (ggplotly). `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
# NOTE(review): d3heatmap has not always been available from CRAN - confirm it
# installs from your repository, or install it from its archive/GitHub source.
install.packages(
  c(
    "ggplot2", "reshape2", "circlize", "ggdendro",
    "d3heatmap", "PerformanceAnalytics", "psych", "plotly"
  ),
  dependencies = TRUE
)

Otherwise

# Attach the packages used by the examples. library() stops with an error when
# a package is missing; require() would only return FALSE and let the script
# fail later, at the first call into the missing package.
library(ggplot2)
library(reshape2)
library(d3heatmap)
library(circlize)
library(ggdendro)
library(PerformanceAnalytics)
library(plotly)  # provides ggplotly(), called in the kernel-density examples

Basic commands

# An expression typed at top level is evaluated and its value is printed.
1 + 1
## [1] 2
# Bind the value 2 to x, then print x plus each loop index from 1 to 5.
x <- 2
for (i in seq_len(5)) {
  print(x + i)
}
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7

Basic commands

# Draw 100 values from the standard normal distribution (rnorm's default
# mean 0, sd 1) and show how they are distributed with a histogram.
x <- rnorm(n = 100)
hist(x)

Simple regression model

# Simulate data from the line y = 1 + 5 * x1 with unit-variance Gaussian noise,
# then recover intercept and slope with ordinary least squares.
set.seed(1056)                           # fixed seed so the example replicates
nobs <- 150                              # number of observations
x1 <- runif(n = nobs, min = 0, max = 5)  # predictor, uniform on [0, 5]
mu <- 1 + 5 * x1                         # linear predictor (true mean of y)
y <- rnorm(n = nobs, mean = mu, sd = 1)  # response: normal noise around mu
fit <- lm(y ~ x1)                        # normal linear model fit
summary(fit)
Fitting linear model: y ~ x1
  Estimate Std. Error t value Pr(>|t|)
x1 5.025 0.05288 95.03 1.494e-134
(Intercept) 1.006 0.1541 6.528 9.971e-10

Plot results-basic R

# Evaluate the fitted model on a regular grid of 200 points between 0 and 5.
# `length.out` is spelled in full rather than relying on partial argument
# matching, and `newdata` is passed as a data frame, the form predict() documents.
xx <- seq(0, 5, length.out = 200)
ypred <- predict(fit, newdata = data.frame(x1 = xx), type = "response")

plot(x1, y, pch = 19, col = "red")                 # scatter plot of the data
lines(xx, ypred, col = "cyan", lwd = 4, lty = 2)   # fitted regression line

# Residuals: one vertical segment per observation, running from its fitted
# value to the observed y.
segments(x1, fitted(fit), x1, y, lwd = 2, col = "gray")

Plot results-ggplot

# The same scatter plot and linear fit drawn with ggplot2: points for the
# observations, plus a red smoother fitted with lm.
dat <- data.frame(x1 = x1, y = y)
ggplot(dat, aes(x = x1, y = y)) +
  geom_point() +
  stat_smooth(method = "lm", colour = "red")

Data Exploration

Read and display data in table format

# Load the exoplanet table; the path is relative to the working directory.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
d <- read.csv("exoplanets.csv", header = TRUE)
# Keep only the rows that have no missing value in any column.
d <- d[complete.cases(d), ]
head(d)

Visualization-Scatter plot

Visualization-Boxplot

Visualization-Boxplot

Visualization-Violin plot

Visualization-Linear Fit

Visualization-Histograms

Visualization-Kernel Density

Visualization-Kernel Density

By groups

# Kernel density of Radius with one filled curve per discovery method; the
# fill legend gets an empty title. ggplotly() converts the ggplot object into
# an interactive plotly figure.
p2 <- ggplot(
  d,
  aes(x = Radius, group = Discovery_method, fill = Discovery_method)
)
p2 <- p2 + geom_density() + scale_fill_discrete(name = "")
ggplotly(p2)

Visualization-Kernel Density

By groups

# Same grouped density of Radius, but with a wider bandwidth (adjust = 1.5)
# and position = "fill", which stacks the groups and rescales them so that
# they sum to 1 at every x (relative share of each discovery method).
p2 <- ggplot(
  d,
  aes(x = Radius, group = Discovery_method, fill = Discovery_method)
)
p2 <- p2 +
  geom_density(adjust = 1.5, position = "fill") +
  scale_fill_discrete(name = "")
ggplotly(p2)

Visualization-correlation matrix

# Pairwise correlations (cor()'s default method, Pearson) between columns
# 2, 3, 5, 6 and 7 of d, printed as a matrix.
dcor <- cor(d[c(2, 3, 5, 6, 7)])
print(dcor)
Table continues below
  Radius Period star_mass star_radius
Radius 1 0.03116 0.3415 0.3532
Period 0.03116 1 0.06246 0.02269
star_mass 0.3415 0.06246 1 0.7933
star_radius 0.3532 0.02269 0.7933 1
star_temperature 0.157 0.04783 0.6342 0.4354
  star_temperature
Radius 0.157
Period 0.04783
star_mass 0.6342
star_radius 0.4354
star_temperature 1

Visualization-correlation matrix

# Log10-transform columns 2, 3, 5, 6 and 7 of d, then centre and scale each
# column before drawing the correlation chart with histograms on the diagonal.
c0 <- log10(d[c(2, 3, 5, 6, 7)])
c0 <- scale(c0)
chart.Correlation(c0, histogram = TRUE, pch = 19)

Visualization-correlation matrix

# Scatter-plot matrix of the log10-transformed columns 2, 3, 5, 6 and 7 of d,
# drawn with psych::pairs.panels.
# NOTE(review): scale = TRUE is understood to size each printed correlation by
# its magnitude - confirm against the psych documentation.
library(psych)
pairs.panels(log10(d[c(2, 3, 5, 6, 7)]), scale = TRUE)

Visualization-Heatmap

Visualization-Heatmap with Dendrograms

Visualization-Dendrograms

Visualization-Chord Diagram

# Correlation matrix of columns 2, 3, 5, 6 and 7 of d, drawn as a chord
# diagram with circlize.
nc <- cor(d[c(2, 3, 5, 6, 7)])
chordDiagram(nc)